rm(list = ls())
library(tidyverse)
library(caret)
library(haven)
library(stringr)
library(survey)
library(splines)
library(rlang)
library(lmtest)
library(sandwich)
library(broom)
library(MASS)
library(matrixStats)
library(dplyr)
library(haven)

set.seed(02138)
#set your own directory!

#setwd("~/Dropbox (Harvard University)/Gov 2001 Rep Paper/R Scripts") 
#setwd("~/Dropbox (Harvard University)/Gov_2001_Rep/R Scripts") 

##data prep

#create dyads
s <- read_csv("Datasets/Direct Contig/contdird.csv") %>% dplyr::select(state1ab, state1no) %>% distinct()
s1 <- s %>% dplyr::slice(rep(1:n(), each=215))
s2<- s %>% dplyr::slice(rep(row_number(), 215)) %>% rename(state2ab = state1ab, state2no = state1no)

dyads <- cbind(s1,s2) 
toDelete <- seq(1, nrow(dyads), 216)
dyads <-  dyads[-toDelete, ]

nyears <- seq(1918, 2001,1)

year <- rep(nyears, each = length(dyads$state1ab))

s3<- dyads %>% dplyr::slice(rep(row_number(), length(nyears)))

dyads_final <- cbind(s3,year) 


##dyads and years 

#filter out dyads that were involved in MIDs and CTs 

years <- seq(1918,2001,1)

# dyad mids
mids_dyads <-read_csv("Datasets/MIDs Dyads/dyadic MIDs 3.1.csv")
d<- mids_dyads %>% dplyr::filter(row_number() %% 2 == 1) %>% dplyr::filter(row_number() %% 2 == 1)%>%filter(strtyr %in% years) #1810 observations 
d<- d%>% filter(hihost %in% seq(4,5,1)) #1323 obs

mids <- d %>% dplyr::select(statea,namea,stateb,nameb,year) %>% rename(state1ab = namea, state1no =  statea, state2ab = nameb, state2no = stateb)

mct <- readRDS("d2.RDS") %>%dplyr::select(country_code_a,ccode_b,country_text_a,cabb_b,year)  %>% 
  rename(state1ab = country_text_a, state1no =  country_code_a, state2ab = cabb_b, state2no = ccode_b)


dyads2 <- setdiff(dyads_final,mids)

dyads_f <- setdiff(dyads2,mct)


##https://statisticsglobe.com/setdiff-r-function/
  


##merge on Xs for stratification?

#contig 

contig <- read_csv("Datasets/Direct Contig/contdird.csv") %>%
  dplyr::select(state1ab, state2ab, state1no, state2no, conttype,year) %>% distinct
contig <- contig%>% mutate(contig, conttype = if_else(conttype > 0 & conttype <5, 1,0))

saveRDS(contig, "contig.RDS")

contig <- readRDS("contig.RDS")

datasets<- c("contig","alliance")
# duplicates <- data.frame(no = 1)
# for (dat in datasets) {
#   duplicates <- duplicates  %>% mutate(!!sym(str_c(data, dyads_f)) := sum(duplicated(eval(parse_expr(dat))))) }
# duplicates

#join 
dyads_f <- dyads_f %>% left_join(., dplyr::select(contig, - c(state2ab, state1ab)), by=c("state2no", "state1no", "year"))
                                
  dyads_f[is.na(dyads_f)] <- 0 # dataset doesnt record any  dyads that dont share borders

c <- dyads_f%>%filter(conttype==1)

#alliance portfolio 

alliance <- read_dta("Datasets/Alliance-retry/atop-sscore (1).dta")%>% dplyr::select(ccode1,cabb1, ccode2, cabb2,  year, s_wt_atop )%>%
  rename(state1ab = cabb1, state2ab = cabb2,state1no = ccode1, state2no = ccode2, score = s_wt_atop)

dyads_f <- dyads_f %>% left_join(., alliance, by=c("state1no", "state2no", "state2ab", "state1ab", "year"))
                                 

##to summarize, I included indicators on alliance score, trade flows, and contiguity to exclude dyads that have close to no interactions (e.g. St. Lucia and North Korea)


##number of control obs we need per year

n <- vector()

yr <- seq(1918,2001,1)

for (i in 1:length(yr)){
 n[i] <- sum(as.numeric(mct$year==yr[i]))
}


number <- 5*n

cn <- data.frame(cbind(number,yr))

control_number <- cn %>%dplyr::filter(number>0) # sum(control_number$number) : 905

y <- sum(as.numeric((control_number$number>0)))
n_y <- as.numeric(control_number$yr)
                  

##70% dyads that are contiguous, 30% that are not (contiguity is a predictor of interaction)
##it might make sense to just drop dyads that have no recorded alliance scores and trade flows, 
##because that probably means they don't have enough interactions to begin with 

cgc <- na.omit(dyads_f)  ##cgc - control group candidates

cg<- split(cgc,cgc$year %in% n_y, drop=FALSE)
cg_t <- cg$"TRUE"

cg_t <- split(cg_t, cg_t$year) 

a <- list()
for (i in 1:length(cg_t)){
  
  a[i] <- cg_t[i]
}


##generate random samples for each year blocked by contiguity status, with the n for each year detailed in the control_number dataframe 
# aleksei 07/12: changed proportions to 0.2, 0.8 for EXPERIMENTATION + filtering noncontiguous countries' alliance above MEDIAN
### NOTE: MCT (from 02a) seems to have 242 obs

#summary(alliance$score)
# min -0.8935       1st qu 0.4343         median 0.5956     3rd qu   0.8968     max 1.0000 

sampled <- list()


for (i in 1:length(a)){
  d <- data.frame(a[i])
  yr_c1 <-d %>%filter(conttype==1)
  yr_c0 <- d%>%filter(conttype==0, score >= 0.5956)
  s_c1<-yr_c1[sample(nrow(yr_c1), control_number$number[i]*0.5), ] ##can change prop later if needed 
  s_c0<-yr_c0[sample(nrow(yr_c0), control_number$number[i]*0.5), ] ##can change prop later if needed 
  s <- rbind(s_c1,s_c0)
  sampled [[i]] <- s
}


###check to make sure 

sampled[[1]] ##test
length(sampled)


h <- vector()

for (i in 1:length(sampled)){
  
  h[i] <- nrow(sampled[[i]])
  
}

sum(h) #905 obs in control ( 5 times the MCT obs)


##sampled is a list of all the control group of all the years (stored as data frames)

sampled_control_group <- reduce(sampled, bind_rows)
saveRDS(sampled_control_group, file = "sampled_control_group.RDS")


##trade dependency - DROPPED 12/14

#trade <- read_csv("Datasets/COW_Trade_4.0/Dyadic_COW_4.0.csv") %>% dplyr::select(ccode1,ccode2, year, smoothtotrade) %>% 
#  rename(state1no = ccode1, state2no = ccode2,trade = smoothtotrade)

